library(corrplot) # for correlation matix
library(stargazer) # visualize model fit
library(skimr) # generate summary statistics
library(car) # statistics
library(lmtest) # linear modeling
library(sandwich)
library(tidyverse) # for data import, manipulation, viz
filter <- dplyr::filter
data2$acj = data2$prbarr * data2$prbconv * data2$prbpris
data2$west_proper = ifelse(data2$west == 1 & data2$central != 1, 1, 0)
df = data2 %>%
mutate(
acj = prbarr * prbconv * prbpris
, west_proper = ifelse(west == 1 & central != 1, 1, 0)
, central_proper = ifelse(west != 1 & central == 1, 1, 0)
, num_minorities = density * pctmin80
, num_ymales = density * pctymle
) %>%
mutate(
ln_crmrte = log(crmrte)
, ln_acj = log(prbarr * prbconv * polpc)
, ln_wloc = log(wloc)
, ln_wtrd = log(wtrd)
, ln_density = log(density)
, ln_taxpc = log(taxpc)
, ln_pctymle = log(pctymle)
) %>%
mutate(
acj = prbarr * prbconv * polpc
, west_proper = ifelse(west == 1 & central != 1, 1, 0)
, central_proper = ifelse(west != 1 & central == 1, 1, 0)
) %>%
select(
ln_crmrte
, ln_acj
, ln_wloc
, ln_wtrd
, ln_taxpc
, ln_density
, pctmin80
, ln_pctymle
, num_minorities
, num_ymales
, urban
, west_proper
)
cor(df)
ln_crmrte ln_acj ln_wloc ln_wtrd ln_taxpc ln_density pctmin80 ln_pctymle num_minorities num_ymales urban west_proper
ln_crmrte 1.0000000 -0.39933207 0.30286706 0.38965891 0.33984322 0.49364251 0.23291821 0.31175403 0.6635139 0.66759236 0.49146445 -0.45141763
ln_acj -0.3993321 1.00000000 0.18125219 -0.04015954 -0.02452981 -0.25189220 0.05632914 -0.24887027 -0.1643648 -0.25482183 -0.16861495 0.15187249
ln_wloc 0.3028671 0.18125219 1.00000000 0.57419433 0.22092405 0.30295286 -0.10213445 0.02258912 0.3918107 0.43137206 0.33004924 -0.15296513
ln_wtrd 0.3896589 -0.04015954 0.57419433 1.00000000 0.16350047 0.42921817 -0.07527956 -0.09784808 0.4839622 0.48656695 0.39889779 -0.19939002
ln_taxpc 0.3398432 -0.02452981 0.22092405 0.16350047 1.00000000 0.10798692 0.02947733 -0.07360943 0.3520511 0.28828811 0.39785937 -0.19215618
ln_density 0.4936425 -0.25189220 0.30295286 0.42921817 0.10798692 1.00000000 -0.09668387 0.17515611 0.4816689 0.56497500 0.39272648 -0.25035461
pctmin80 0.2329182 0.05632914 -0.10213445 -0.07527956 0.02947733 -0.09668387 1.00000000 -0.01224664 0.2848787 -0.05069759 0.01619569 -0.62451443
ln_pctymle 0.3117540 -0.24887027 0.02258912 -0.09784808 -0.07360943 0.17515611 -0.01224664 1.00000000 0.1849915 0.40960966 0.12927006 -0.05212782
num_minorities 0.6635139 -0.16436478 0.39181075 0.48396222 0.35205109 0.48166887 0.28487870 0.18499148 1.0000000 0.88003337 0.80710870 -0.36836902
num_ymales 0.6675924 -0.25482183 0.43137206 0.48656695 0.28828811 0.56497500 -0.05069759 0.40960966 0.8800334 1.00000000 0.81288832 -0.20665165
urban 0.4914645 -0.16861495 0.33004924 0.39889779 0.39785937 0.39272648 0.01619569 0.12927006 0.8071087 0.81288832 1.00000000 -0.08000341
west_proper -0.4514176 0.15187249 -0.15296513 -0.19939002 -0.19215618 -0.25035461 -0.62451443 -0.05212782 -0.3683690 -0.20665165 -0.08000341 1.00000000
Base model
base_md = lm(log(crmrte) ~ log(acj), data = data2)
plot(base_md)
H0: demographic factors do not have an impact on the model fit
md1 = lm(log(crmrte) ~ log(acj) + log(density) + log(pctymle) + pctmin80, data = data2)
plot(md1)
coeftest(md1)
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.9979154 0.5977845 -8.3607 1.061e-12 ***
log(acj) -0.4636576 0.0607214 -7.6358 3.043e-11 ***
log(density) 0.1787405 0.0269804 6.6248 2.999e-09 ***
log(pctymle) 0.0873188 0.1990884 0.4386 0.6621
pctmin80 0.0121271 0.0021797 5.5635 3.008e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
coeftest(md1, vcov = vcovHC)
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.9979154 0.5727402 -8.7263 1.934e-13 ***
log(acj) -0.4636576 0.1495965 -3.0994 0.00263 **
log(density) 0.1787405 0.2364262 0.7560 0.45173
log(pctymle) 0.0873188 0.1534316 0.5691 0.57079
pctmin80 0.0121271 0.0023181 5.2315 1.192e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Density and minority percetange are significant on explaing the variation on crime rate. Bot variables have a positive relationship with crime rate. % of young males does not explain the variation of crime rate.
ggplot(data2, aes(x= pctymle*density , y=log(crmrte + 1))) +
geom_point()+
geom_smooth(method=lm, se=FALSE)
interac_md2 = lm(log(crmrte) ~ log(acj) + log(pctymle)*log(density) + pctmin80, data = data2)
plot(interac_md2)
coeftest(interac_md2, vcov= vcovHC)
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.3111217 0.3238046 -16.4022 < 2.2e-16 ***
log(acj) -0.3427713 0.1046721 -3.2747 0.001537 **
pctymle 2.4606492 2.4707490 0.9959 0.322153
density 0.1974639 0.1243659 1.5878 0.116097
pctmin80 0.0111803 0.0022048 5.0710 2.328e-06 ***
pctymle:density -0.2173066 1.3803559 -0.1574 0.875285
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
md3 = lm(log(crmrte) ~ log(acj) + log(density*pctmin80) + log(pctymle), data = data2)
plot(md3)
coeftest(md3, vcov.= vcovHC)
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.030762 0.513855 -9.7902 1.214e-15 ***
log(acj) -0.411609 0.104502 -3.9388 0.000166 ***
log(density * pctmin80) 0.193798 0.086934 2.2293 0.028404 *
log(pctymle) 0.112892 0.132557 0.8516 0.396775
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
interact_md4 = lm(log(crmrte) ~ log(acj) + density*pctmin80 + log(pctymle), data = data2)
plot(interact_md4)
interac_md5 = lm(log(crmrte) ~ log(acj) + pctymle*density + pctmin80, data = data2)
plot(interac_md5)
se = sqrt(diag(vcovHC(base_md)))
se1 = sqrt(diag(vcovHC(md1)))
se2 = sqrt(diag(vcovHC(interac_md2)))
se3 = sqrt(diag(vcovHC(md3)))
se4 = sqrt(diag(vcovHC(interact_md4)))
se5 = sqrt(diag(vcovHC(interac_md5)))
stargazer(
base_md
, md1
, interac_md2
, md3
, interact_md4
, interac_md5
, type = "text"
, se = list(se, se1, se2, se3, se4, se5)
, star.cutoffs = c(0.05, 0.01, 0.001)
)
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changednumber of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)
===================================================================================================================================================================
Dependent variable:
-----------------------------------------------------------------------------------------------------------------------------------------
log(crmrte)
(1) (2) (3) (4) (5) (6)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------
log(acj) -0.470*** -0.464** -0.440** -0.412*** -0.341** -0.343**
(0.106) (0.150) (0.135) (0.105) (0.104) (0.105)
log(density) 0.179 1.774
(0.236) (1.610)
log(density * pctmin80) 0.194*
(0.087)
pctymle 2.461
(2.471)
density 0.232** 0.197
(0.083) (0.124)
log(pctymle) 0.087 -0.107 0.113 0.273
(0.153) (0.253) (0.133) (0.160)
density:pctmin80 -0.002
(0.003)
pctmin80 0.012*** 0.012*** 0.013*** 0.011***
(0.002) (0.003) (0.004) (0.002)
log(pctymle):log(density) 0.620
(0.698)
pctymle:density -0.217
(1.380)
Constant -4.937*** -4.998*** -5.424*** -5.031*** -4.467*** -5.311***
(0.296) (0.573) (0.865) (0.514) (0.575) (0.324)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------
Observations 90 90 90 90 90 90
R2 0.315 0.629 0.648 0.663 0.652 0.650
Adjusted R2 0.307 0.612 0.627 0.651 0.631 0.629
Residual Std. Error 0.457 (df = 88) 0.342 (df = 85) 0.335 (df = 84) 0.324 (df = 86) 0.333 (df = 84) 0.334 (df = 84)
F Statistic 40.481*** (df = 1; 88) 36.066*** (df = 4; 85) 30.864*** (df = 5; 84) 56.390*** (df = 3; 86) 31.479*** (df = 5; 84) 31.211*** (df = 5; 84)
===================================================================================================================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
demo_md1 = lm(log(crmrte) ~ log(acj) + pctmin80, data = data2)
plot(demo_md1)
demo_md2 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80), data = data2)
plot(demo_md2)
s2_demo1 = sqrt(diag(vcovHC(demo_md1)))
s2_demo2 = sqrt(diag(vcovHC(demo_md2)))
stargazer(
base_md
, demo_md1
, demo_md2
, type = "text"
, se = list(se, s2_demo1, s2_demo2)
, star.cutoffs = c(0.05, 0.01, 0.001)
)
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changednumber of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)
============================================================================================
Dependent variable:
--------------------------------------------------------------------
log(crmrte)
(1) (2) (3)
--------------------------------------------------------------------------------------------
log(acj) -0.470*** -0.521*** -0.424***
(0.106) (0.093) (0.097)
pctmin80 0.011***
(0.002)
log(density * pctmin80) 0.195*
(0.087)
Constant -4.937*** -5.375*** -5.354***
(0.296) (0.249) (0.227)
--------------------------------------------------------------------------------------------
Observations 90 90 90
R2 0.315 0.430 0.662
Adjusted R2 0.307 0.416 0.654
Residual Std. Error 0.457 (df = 88) 0.419 (df = 87) 0.323 (df = 87)
F Statistic 40.481*** (df = 1; 88) 32.759*** (df = 2; 87) 85.029*** (df = 2; 87)
============================================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
geo_md1 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + west_proper, data = data2)
plot(geo_md1)
geo_md2 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + west_proper + urban, data = data2)
plot(geo_md2)
geo_md3 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + urban, data = data2)
plot(geo_md3)
se_geo1 = sqrt(diag(vcovHC(geo_md1)))
se_geo2 = sqrt(diag(vcovHC(geo_md2)))
se_geo3 = sqrt(diag(vcovHC(geo_md3)))
stargazer(
demo_md2
, geo_md1
, geo_md2
, geo_md3
, type = "text"
, se = list(s2_demo2, se_geo1, se_geo2)
, star.cutoffs = c(0.05, 0.01, 0.001)
)
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changednumber of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)
===================================================================================================================
Dependent variable:
-------------------------------------------------------------------------------------------
log(crmrte)
(1) (2) (3) (4)
-------------------------------------------------------------------------------------------------------------------
log(acj) -0.424*** -0.423*** -0.385*** -0.392***
(0.097) (0.100) (0.087) (0.054)
log(density * pctmin80) 0.195* 0.185 0.157 0.179***
(0.087) (0.138) (0.161) (0.022)
west_proper -0.062 -0.123
(0.248) (0.295)
urban 0.302 0.261
(0.294) (0.134)
Constant -5.354*** -5.308*** -5.127*** -5.236***
(0.227) (0.347) (0.495) (0.173)
-------------------------------------------------------------------------------------------------------------------
Observations 90 90 90 90
R2 0.662 0.663 0.681 0.676
Adjusted R2 0.654 0.651 0.666 0.665
Residual Std. Error 0.323 (df = 87) 0.324 (df = 86) 0.317 (df = 85) 0.318 (df = 86)
F Statistic 85.029*** (df = 2; 87) 56.375*** (df = 3; 86) 45.339*** (df = 4; 85) 59.800*** (df = 3; 86)
===================================================================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
mix_md1 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(mix), data = data2)
plot(mix_md1)
se_mix1 = sqrt(diag(vcovHC(mix_md1)))
stargazer(
demo_md2
, mix_md1
, type = "text"
, se = list(s2_demo2, se_mix1)
, star.cutoffs = c(0.05, 0.01, 0.001)
)
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changednumber of rows of result is not a multiple of vector length (arg 2)
=====================================================================
Dependent variable:
---------------------------------------------
log(crmrte)
(1) (2)
---------------------------------------------------------------------
log(acj) -0.424*** -0.423***
(0.097) (0.096)
log(density * pctmin80) 0.195* 0.199*
(0.087) (0.084)
log(mix) 0.078
(0.092)
Constant -5.354*** -5.191***
(0.227) (0.356)
---------------------------------------------------------------------
Observations 90 90
R2 0.662 0.667
Adjusted R2 0.654 0.656
Residual Std. Error 0.323 (df = 87) 0.322 (df = 86)
F Statistic 85.029*** (df = 2; 87) 57.537*** (df = 3; 86)
=====================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
eco_md1 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(taxpc), data = data2)
plot(eco_md1)
eco_md2 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(wloc), data = data2)
plot(eco_md2 )
eco_md3 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(wloc) + log(taxpc), data = data2)
plot(eco_md3)
eco_md4 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(wloc)*log(taxpc), data = data2)
plot(eco_md4)
eco_md5 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + taxpc + taxpc^2, data = data2)
plot(eco_md5)
eco_md6 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + wloc + wloc^2, data = data2)
plot(eco_md6)
eco_md7 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(wloc)*log(wloc), data = data2)
plot(eco_md7)
se_eco1 = sqrt(diag(vcovHC(eco_md1)))
se_eco2 = sqrt(diag(vcovHC(eco_md2)))
se_eco3 = sqrt(diag(vcovHC(eco_md3)))
se_eco4 = sqrt(diag(vcovHC(eco_md4)))
se_eco5 = sqrt(diag(vcovHC(eco_md5)))
se_eco6 = sqrt(diag(vcovHC(eco_md6)))
se_eco7 = sqrt(diag(vcovHC(eco_md7)))
stargazer(
eco_md1
, eco_md2
, eco_md3
, eco_md4
, eco_md5
, type = "text"
, se = list(s2_demo2, se_eco1, se_eco2, se_eco3, se_eco4, se_eco5)
, star.cutoffs = c(0.05, 0.01, 0.001)
)
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changednumber of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)
==========================================================================================================================================
Dependent variable:
------------------------------------------------------------------------------------------------------------------
log(crmrte)
(1) (2) (3) (4) (5)
------------------------------------------------------------------------------------------------------------------------------------------
log(acj) -0.398*** -0.428*** -0.408*** -0.397*** -0.388***
(0.097) (0.089) (0.089) (0.083) (0.083)
log(density * pctmin80) 0.190* 0.181* 0.179* 0.172* 0.190*
(0.087) (0.085) (0.084) (0.083) (0.083)
log(taxpc) 0.254 0.192 -13.765***
(0.177)
log(wloc):log(taxpc) 2.417
log(wloc) 1.004 0.890 -7.712***
(0.515) (0.494)
taxpc 0.007
Constant -6.178*** -11.089*** -11.061*** 38.665*** -5.491
(0.227) (0.661) (2.867) (2.801) (40.400)
------------------------------------------------------------------------------------------------------------------------------------------
Observations 90 90 90 90 90
R2 0.675 0.687 0.694 0.703 0.685
Adjusted R2 0.664 0.676 0.680 0.686 0.674
Residual Std. Error 0.318 (df = 86) 0.312 (df = 86) 0.310 (df = 85) 0.308 (df = 84) 0.313 (df = 86)
F Statistic 59.628*** (df = 3; 86) 62.872*** (df = 3; 86) 48.279*** (df = 4; 85) 39.810*** (df = 5; 84) 62.469*** (df = 3; 86)
==========================================================================================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
stargazer(
eco_md6
, eco_md7
, type = "text"
, se = list(s2_demo2, se_eco5, se_eco6)
, star.cutoffs = c(0.05, 0.01, 0.001)
)
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changed
============================================================
Dependent variable:
------------------------------
log(crmrte)
(1) (2)
------------------------------------------------------------
log(acj) -0.429*** -0.428***
(0.097) (0.090)
log(density * pctmin80) 0.182* 0.181*
(0.087) (0.085)
wloc 0.003
log(wloc) 1.004
Constant -6.294*** -11.089***
(0.227) (0.234)
------------------------------------------------------------
Observations 90 90
R2 0.685 0.687
Adjusted R2 0.674 0.676
Residual Std. Error (df = 86) 0.313 0.312
F Statistic (df = 3; 86) 62.304*** 62.872***
============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
m4 <- lm(log(crmrte) ~ log(acj) + log(avgsen) + log(density) + mix + pctymle + pctmin80 + west + central + urban + log(taxpc) , data = data2)
combined_md1 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(taxpc) + urban, data = data2)
plot(combined_md1)
combined_md1 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + log(taxpc) + urban, data = data2)
plot(combined_md1)
combined_md2 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + taxpc*taxpc + urban, data = data2)
plot(combined_md2)
combined_md3 = lm(log(crmrte) ~ log(acj) + log(density * pctmin80) + taxpc*taxpc + wloc*wloc + urban, data = data2)
plot(combined_md3)
se_m4 = sqrt(diag(vcovHC(m4)))
se_combined_md1 = sqrt(diag(vcovHC(combined_md1)))
se_combined_md2 = sqrt(diag(vcovHC(combined_md2)))
se_combined_md3 = sqrt(diag(vcovHC(combined_md3)))
stargazer(
demo_md2
, m4
, combined_md1
, combined_md2
, combined_md3
, type = "text"
, se = list(s2_demo2, se_m4, se_combined_md1)
, star.cutoffs = c(0.05, 0.01, 0.001)
)
length of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changedlength of NULL cannot be changednumber of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)number of rows of result is not a multiple of vector length (arg 2)
===========================================================================================================================================
Dependent variable:
-------------------------------------------------------------------------------------------------------------------
log(crmrte)
(1) (2) (3) (4) (5)
-------------------------------------------------------------------------------------------------------------------------------------------
log(acj) -0.424*** -0.384** -0.380*** -0.370*** -0.384***
(0.097) (0.132) (0.088) (0.054) (0.054)
log(density * pctmin80) 0.195* 0.179* 0.180*** 0.172***
(0.087) (0.091) (0.022) (0.022)
log(avgsen) -0.068
(0.159)
log(density) 0.142
(0.346)
mix -0.053
(0.715)
pctymle 1.690
(1.513)
pctmin80 0.009
(0.005)
west -0.126
(0.191)
central 0.009
(0.201)
taxpc 0.006* 0.005
(0.003) (0.003)
wloc 0.002
(0.001)
wtrd 0.001
(0.001)
urban 0.289 0.201 0.185 0.105
(0.533) (0.206) (0.136) (0.143)
log(taxpc) 0.210 0.191
(0.235) (0.234)
Constant -5.354*** -5.640*** -5.883*** -5.387*** -6.096***
(0.227) (0.985) (0.916) (0.184) (0.437)
-------------------------------------------------------------------------------------------------------------------------------------------
Observations 90 90 90 90 90
R2 0.662 0.669 0.683 0.692 0.704
Adjusted R2 0.654 0.627 0.668 0.678 0.683
Residual Std. Error 0.323 (df = 87) 0.335 (df = 79) 0.316 (df = 85) 0.312 (df = 85) 0.309 (df = 83)
F Statistic 85.029*** (df = 2; 87) 15.969*** (df = 10; 79) 45.775*** (df = 4; 85) 47.771*** (df = 4; 85) 32.959*** (df = 6; 83)
===========================================================================================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
plot(log(data2$crmrte), combined_md1$fitted.values, main = "Crime Rate - Actual vs Predicted")
abline(a=0,b=1)